This notebook performs Exploratory Data Analysis on the motor vehicle crashes data aggregated with the ACS data
df <- read_csv("../data/processed/acs_mvc_combined_2018_2023.csv")
glimpse(df)
Rows: 13,984
Columns: 50
$ geoid <dbl> 3.6005e+10, 3.6005e+10, 3.6005e+10, 3.6005e…
$ year <dbl> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2…
$ total_population <dbl> 47900.000, 46630.000, 44881.000, 45523.000,…
$ pct_male_population <dbl> 13.576200, 13.607120, 13.691763, 13.568965,…
$ pct_female_population <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988,…
$ pct_white_population <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340,…
$ pct_black_population <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.7…
$ pct_asian_population <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881…
$ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.8…
$ pct_foreign_born <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.9…
$ pct_age_under_18 <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758,…
$ pct_age_18_34 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_age_35_64 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_age_65_plus <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ median_income <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 5…
$ pct_income_under_25k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_income_25k_75k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_income_75k_plus <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_below_poverty <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_above_poverty <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ poverty_rate <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.44042…
$ median_gross_rent <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0…
$ pct_owner_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_renter_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_no_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_one_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_two_plus_vehicles <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_less_than_hs <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747,…
$ pct_hs_diploma <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862,…
$ pct_some_college <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245,…
$ pct_associates_degree <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898,…
$ pct_bachelors_degree <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724…
$ pct_graduate_degree <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272…
$ pct_in_labor_force <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_employed <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_unemployed <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_not_in_labor_force <dbl> 14.780793, 14.720137, 14.705555, 14.632164,…
$ unemployment_rate <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.47874…
$ pct_commute_short <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_commute_medium <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_commute_long <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_drive_alone <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_carpool <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_public_transit <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_walk <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_bike <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_work_from_home <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ crash_rate_per_1000 <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000…
$ injury_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000…
$ fatality_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
df <- df %>%
mutate(
geoid = as.character(geoid),
year = as.integer(year),
total_population = as.integer(total_population)
)
glimpse(df)
Rows: 13,984
Columns: 50
$ geoid <chr> "36005000100", "36005000100", "36005000100"…
$ year <int> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2…
$ total_population <int> 47900, 46630, 44881, 45523, 30541, 24345, 5…
$ pct_male_population <dbl> 13.576200, 13.607120, 13.691763, 13.568965,…
$ pct_female_population <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988,…
$ pct_white_population <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340,…
$ pct_black_population <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.7…
$ pct_asian_population <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881…
$ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.8…
$ pct_foreign_born <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.9…
$ pct_age_under_18 <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758,…
$ pct_age_18_34 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_age_35_64 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_age_65_plus <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ median_income <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 5…
$ pct_income_under_25k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_income_25k_75k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_income_75k_plus <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_below_poverty <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_above_poverty <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ poverty_rate <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.44042…
$ median_gross_rent <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0…
$ pct_owner_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_renter_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_no_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_one_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_two_plus_vehicles <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_less_than_hs <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747,…
$ pct_hs_diploma <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862,…
$ pct_some_college <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245,…
$ pct_associates_degree <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898,…
$ pct_bachelors_degree <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724…
$ pct_graduate_degree <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272…
$ pct_in_labor_force <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_employed <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_unemployed <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_not_in_labor_force <dbl> 14.780793, 14.720137, 14.705555, 14.632164,…
$ unemployment_rate <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.47874…
$ pct_commute_short <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_commute_medium <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
$ pct_commute_long <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_drive_alone <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_carpool <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_public_transit <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
$ pct_walk <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_bike <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_work_from_home <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ crash_rate_per_1000 <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000…
$ injury_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000…
$ fatality_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000…
# Visualize missingness
vis_miss(df)
# Count NA values per column
colSums(is.na(df))
geoid year total_population
0 0 0
pct_male_population pct_female_population pct_white_population
426 426 426
pct_black_population pct_asian_population pct_hispanic_population
426 426 426
pct_foreign_born pct_age_under_18 pct_age_18_34
426 426 426
pct_age_35_64 pct_age_65_plus median_income
426 426 0
pct_income_under_25k pct_income_25k_75k pct_income_75k_plus
426 426 426
pct_below_poverty pct_above_poverty poverty_rate
426 426 455
median_gross_rent pct_owner_occupied pct_renter_occupied
0 426 426
pct_no_vehicle pct_one_vehicle pct_two_plus_vehicles
426 426 426
pct_less_than_hs pct_hs_diploma pct_some_college
426 426 426
pct_associates_degree pct_bachelors_degree pct_graduate_degree
426 426 426
pct_in_labor_force pct_employed pct_unemployed
426 426 426
pct_not_in_labor_force unemployment_rate pct_commute_short
426 464 426
pct_commute_medium pct_commute_long pct_drive_alone
426 426 426
pct_carpool pct_public_transit pct_walk
426 426 426
pct_bike pct_work_from_home crash_rate_per_1000
426 426 426
injury_rate_per_1000 fatality_rate_per_1000
426 426
Identify and remove 426 rows with extensive missingness
df$na_rate <- rowMeans(is.na(df))
# Filter out rows with high missingness
df <- df %>%
filter(rowMeans(is.na(.)) <= 0.05)
# Drop 29 rows with poverty_rate NA and/or 38 unemployment_rate NA values
df <- df %>%
drop_na(c(unemployment_rate, poverty_rate))
# Drop na_rate column
df <- df %>% select(-na_rate)
Second quality check
# Visualize missingness
vis_miss(df)
NO NA values!
replace_top_99_with_median <- function(x) {
# Ensure numeric
if (!is.numeric(x)) stop("Column must be numeric.")
# Calculate the 99th percentile and median
upper_threshold <- quantile(x, 0.99, na.rm = TRUE)
median_val <- median(x, na.rm = TRUE)
# Replace values above the 99th percentile with the median
x[x > upper_threshold] <- median_val
return(x)
}
per_1000_vars <- grep("per_1000", names(df), value = TRUE)
df[per_1000_vars] <- lapply(df[per_1000_vars], replace_top_99_with_median)
df <- df %>%
select(-c(poverty_rate,
pct_below_poverty,
pct_above_poverty,
pct_employed,
pct_unemployed,
pct_drive_alone))
df <- df %>%
mutate(
borough = case_when(
substr(geoid, 1, 5) == "36005" ~ "Bronx",
substr(geoid, 1, 5) == "36047" ~ "Brooklyn",
substr(geoid, 1, 5) == "36061" ~ "Manhattan",
substr(geoid, 1, 5) == "36081" ~ "Queens",
substr(geoid, 1, 5) == "36085" ~ "Staten Island",
TRUE ~ "Unknown"
)
)
saveRDS(df, "../data/models/social-risk-crash-rate-data.rds")